Set working directory and remove exisiting data

knitr::opts_chunk$set(echo = TRUE)

# Set working directory
setwd("C:\\Users\\Ran\\OneDrive\\cabi-trip-history-data")

# Remove exisiting data
rm(list=ls(all=TRUE))

Load libraries

# Load libraries
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(scales)
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:lubridate':
## 
##     here
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(ggmap)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
## 
##     wind
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Show the raw data

# Public data for Capital Bike Share program in Washington DC 
load("FinalDataNew.rdata")
FinalDataNew <- FinalData
head(FinalDataNew)
##        TotalDurationMs           StartDate             EndDate
## 117692          489000 2010-09-15 14:05:00 2010-09-15 14:13:00
## 117691          970000 2010-09-16 10:11:00 2010-09-16 10:27:00
## 117690           14000 2010-09-17 11:00:00 2010-09-17 11:00:00
## 117689           76000 2010-09-17 11:04:00 2010-09-17 11:05:00
## 117688            9000 2010-09-17 11:05:00 2010-09-17 11:06:00
## 117686            7000 2010-09-17 11:19:00 2010-09-17 11:19:00
##        StartStationNumber       StartStation EndStationNumber
## 117692              31009 27th & Crystal Dr             31009
## 117691              31500 4th & Adams St NE             31500
## 117690              31101    14th & V St NW             31101
## 117689              31101    14th & V St NW             31101
## 117688              31101    14th & V St NW             31101
## 117686              31101    14th & V St NW             31101
##                EndStation BikeNumber AccountType
## 117692 27th & Crystal Dr      W00294      Casual
## 117691 4th & Adams St NE      W00377      Casual
## 117690    14th & V St NW      W00824  Registered
## 117689    14th & V St NW      W00824  Registered
## 117688    14th & V St NW      W00824  Registered
## 117686    14th & V St NW      W00824  Registered

Show the dimension of the raw data

dim(FinalDataNew) # the length of the data is 13,623,103
## [1] 16108666        9

Summarize DC bike share data at different time for data visulization

# Load the starting date and time of all bike rentals
load("SummaryDateAndTime.rdata")
head(SummaryDateAndTime)
##   time_num  time hour day   weekday month year
## 1      785 14:05   14  15 Wednesday     9 2010
## 2      551 10:11   10  16  Thursday     9 2010
## 3      600 11:00   11  17    Friday     9 2010
## 4      604 11:04   11  17    Friday     9 2010
## 5      605 11:05   11  17    Friday     9 2010
## 6      619 11:19   11  17    Friday     9 2010
# Count rental event by year
year_summary <- ddply(SummaryDateAndTime,.(year),summarise,N = length(year))

# Count rental event by month and hour
month_summary <- ddply(SummaryDateAndTime,.(month,hour),summarise, N = length(hour))

# Count rental event by weeday and hour
weekday_summary <- ddply(SummaryDateAndTime,.(weekday,hour),summarise, N = length(hour))

Visualize DC bike share data at different year (2010 only has Q4 data, and 2016 only has Q1, Q2 data)

ggplot(data=year_summary, aes(x=year, y=N)) +
  geom_bar(stat="identity")+
  # Add text at top of bars
  geom_text(data=year_summary,aes(x=year,y=N,label=N),vjust=0)

Visualize DC bike share data at different month

ggplot(SummaryDateAndTime, aes(x = hour, y = N, colour = month)) +
  geom_point(data = month_summary, aes(group = month)) +
  geom_line(data = month_summary, aes(group = month)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle("People rent more bikes from April to October.") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(plot.title=element_text(size=18))

Visualize the different rental demands at weekday and weekend

ggplot(SummaryDateAndTime, aes(x = hour, y = N, colour = weekday)) +
  geom_point(data = weekday_summary, aes(group = weekday)) +
  geom_line(data = weekday_summary, aes(group = weekday)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle("Different demands in weekday and weekend.\n") + 
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(plot.title=element_text(size=18))

Create a dataframe with start station number and day

# Create a dataframe with StartStationNumber and Weekday
StartStationinWeek.df <- data.frame(FinalDataNew$StartStationNumber, SummaryDateAndTime$weekday)
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'SummaryWeekday')
head(StartStationinWeek.df)
##   StartStationNumber SummaryWeekday
## 1              31009      Wednesday
## 2              31500       Thursday
## 3              31101         Friday
## 4              31101         Friday
## 5              31101         Friday
## 6              31101         Friday

Group the days by weekday and weekend

weekends <- c('Saturday', 'Sunday') #define Sat and Sun as weekend
# If the day is Saturday or Sunday, labels the day as "weekend", other wise labels it as "weekday"
StartStationinWeek.df$SummaryWeekday <- factor((StartStationinWeek.df$SummaryWeekday %in% weekends), 
       levels=c(TRUE, FALSE), labels=c('weekend', 'weekday')) 
colnames(StartStationinWeek.df) <- c('StartStationNumber', 'WeekdayOrWeekend')
head(StartStationinWeek.df)
##   StartStationNumber WeekdayOrWeekend
## 1              31009          weekday
## 2              31500          weekday
## 3              31101          weekday
## 4              31101          weekday
## 5              31101          weekday
## 6              31101          weekday

Count the rental event by station and weekday/weekend

library(plyr)
SummaryStartStationinWeek.df <- count(StartStationinWeek.df, c('StartStationNumber', 'WeekdayOrWeekend')) 
head(SummaryStartStationinWeek.df)
##   StartStationNumber WeekdayOrWeekend  freq
## 1              31000          weekend  2607
## 2              31000          weekday  6765
## 3              31001          weekend  6751
## 4              31001          weekday 15738
## 5              31002          weekend  8331
## 6              31002          weekday 20980

Find the top rental stations in weekday, weekend, and all week

SummaryStationWeekday.df <- subset(SummaryStartStationinWeek.df[c(1,3)], 
                                  SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekday' & 
                                    !is.na(SummaryStartStationinWeek.df$StartStationNumber))
SummaryStationWeekend.df <- subset(SummaryStartStationinWeek.df[c(1,3)], 
                                  SummaryStartStationinWeek.df$WeekdayOrWeekend == 'weekend' &
                                    !is.na(SummaryStartStationinWeek.df$StartStationNumber))

# Scale the freq in weekday and weekend
SummaryStationWeekday.df$freq <- SummaryStationWeekday.df$freq/5
SummaryStationWeekend.df$freq <- SummaryStationWeekend.df$freq/2

# Sort the Station by freq (from high to low)
SummaryStationWeekday.df <- SummaryStationWeekday.df[order(-SummaryStationWeekday.df$freq),]
SummaryStationWeekend.df <- SummaryStationWeekend.df[order(-SummaryStationWeekend.df$freq),]

# Find the top 5 rental station in weekday
Top5StationInWeekday.df <- SummaryStationWeekday.df[1:5,]
TopStationInWeekday.df <- cbind(Top5StationInWeekday.df[1], 'Weekday')
colnames(TopStationInWeekday.df)[2] <- 'Week'

# Find the top 5 rental station in weekend
Top5StationInWeekend.df <- SummaryStationWeekend.df[1:5,]
TopStationInWeekend.df <- cbind(Top5StationInWeekend.df[1], 'Weekend')
colnames(TopStationInWeekend.df)[2] <- 'Week'

# Merge the top 5 rental staions in weekday and weekend
TopStationInWeek.df <- merge(TopStationInWeekday.df,TopStationInWeekend.df, by = 'StartStationNumber', all = TRUE)
TopStationInWeek.df$Week.x <- as.character(TopStationInWeek.df$Week.x)

# If the station in included in both weekday and weekend stations label it as top stations all week
TopStationInWeek.df$Week.x[!is.na(TopStationInWeek.df$Week.x) & !is.na(TopStationInWeek.df$Week.y)] <- 'AllWeek'
TopStationInWeek.df$Week.x[is.na(TopStationInWeek.df$Week.x)] <- 'Weekend'
TopStationInWeek.df <- TopStationInWeek.df[1:2]
colnames(TopStationInWeek.df)[2] <- 'TopRentalStations'
TopStationInWeek.df
##   StartStationNumber TopRentalStations
## 1              31101           Weekend
## 2              31200           AllWeek
## 3              31201           AllWeek
## 4              31229           Weekday
## 5              31623           Weekday
## 6              31241           Weekday
## 7              31247           Weekend
## 8              31258           Weekend

Extract the GPS info of stations from the online XML file

Find the overall top 10 stations

# Count rental demand in each hour by start station
HourlyDemandPrediction.df <- count(DemandPredictionDataSet,c(1,3,4,5,7))

# Count rental demand in each hour by end station
HourlyDemandPrediction.end.df <- count(DemandPredictionDataSet,c(9,10,11,13,14))
TopStations <- count(DemandPredictionDataSet,c('StartStationNumber'))
TopStations$StartStationNumber <- as.numeric(as.character(TopStations$StartStationNumber))
# Sort the stations by frequency
TopStations <- TopStations[order(-TopStations$freq), ]
# Get top 10 stations 
Top10Stations <- head(TopStations$StartStationNumber,10)
Top10Stations
##  [1] 31200 31623 31201 31258 31247 31229 31241 31101 31214 31613

Visualize rental demand in Station X by hour and weekday

# Top X station
X <- 1
StationX <- Top10Stations[X]

# Create a dataset for the rental demand in Station X by hour and weekday

# Get the data only for Station X
StationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$StartStationNumber == StationX)

# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))

# Count the returned bike in Station X by hour
EndStationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
                         summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')

# Count the rental demand in Station X by hour and weekday
StationX.DayAndHour <- aggregate(N ~ weekday + hour, data = StationX.summary, FUN = sum)

# Visualize the rental demand in Station X by hour and weekday
ggplot(StationX.DayAndHour, aes(x = as.factor(hour), y = N, colour = weekday)) +
  geom_point(data = StationX.DayAndHour, aes(group = weekday)) +
  geom_line(data = StationX.DayAndHour, aes(group = weekday)) +
  scale_x_discrete("hour") +
  scale_y_continuous("N") +
  theme_minimal() +
  ggtitle(sprintf("Rental Demand in Station %d", Top10Stations[X])) + 
  theme_minimal() +
  theme(plot.title=element_text(size=18))+
  theme(plot.title = element_text(hjust = 0.5))

Visualize rental demand in Station X by hour and month in 3D

library(plotly)
# Get the rental demand of station X on Mondays at each hour and each month 
Demand <- xtabs(N ~ hour + month, data = subset(StationX.summary, StationX.summary$weekday == 'Monday'))
plot_ly(x = 1:12, z = Demand, type = "surface") %>%
     layout(scene = list(title = "Rental Demand on Saturdays in Station 31623, x = month, y= hour, z1 = count", xaxis = list(title = "Month"), yaxis = list(title = "Hour"), zaxis = list(title = "Demand")))

Prepare functions for create the predictive model

Polynomial model with the input of station number, year, month, weekday, and hour

# Stations that use 12th polynomial model
Model1Station <- c(31623, 31201, 31258, 31247, 31229, 31214, 31101, 31613)

# Stations that use 10th polynomial model
Model2Station <- c(31200, 31241)

# 12th polynomial model for weekday 
weekdayPolyModel1 <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
  dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
  dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
  dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
  dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
  dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
  dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
  dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
  dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
  dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
  dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
  dataset.out.weekday$hour_p11 <- dataset.out.weekday$hour^11
  dataset.out.weekday$hour_p12 <- dataset.out.weekday$hour^12
  # Fitting the polynomial regression
  regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4
                      +hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10+hour_p11+hour_p12,
                       data = dataset.out.weekday)
  # Function returns the regressor for weekday
  return(regressor.weekday)
}

# 10th polynomial model for weekday 
weekdayPolyModel2 <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekday$year_p2 <- dataset.out.weekday$year^2
  dataset.out.weekday$month_p2 <- dataset.out.weekday$month^2
  dataset.out.weekday$hour_p2 <- dataset.out.weekday$hour^2
  dataset.out.weekday$hour_p3 <- dataset.out.weekday$hour^3
  dataset.out.weekday$hour_p4 <- dataset.out.weekday$hour^4
  dataset.out.weekday$hour_p5 <- dataset.out.weekday$hour^5
  dataset.out.weekday$hour_p6 <- dataset.out.weekday$hour^6
  dataset.out.weekday$hour_p7 <- dataset.out.weekday$hour^7
  dataset.out.weekday$hour_p8 <- dataset.out.weekday$hour^8
  dataset.out.weekday$hour_p9 <- dataset.out.weekday$hour^9
  dataset.out.weekday$hour_p10 <- dataset.out.weekday$hour^10
  # Fitting the polynomial regression
  regressor.weekday <- lm(formula = out ~ year+year_p2+month+month_p2+hour+hour_p2+hour_p3+hour_p4+
                            hour_p5+hour_p6+hour_p7+hour_p8+hour_p9+hour_p10,
                       data = dataset.out.weekday)
  return(regressor.weekday)
}

# 4th polynomial model for weekend
weekendPolyModel <- function(input_dataset){
  dataset.out.weekday <- input_dataset
  dataset.out.weekend$year_p2 <- dataset.out.weekend$year^2
  dataset.out.weekend$month_p2 <- dataset.out.weekend$month^2
  dataset.out.weekend$hour_p2 <- dataset.out.weekend$hour^2
  dataset.out.weekend$hour_p3 <- dataset.out.weekend$hour^3
  dataset.out.weekend$hour_p4 <- dataset.out.weekend$hour^4
  # Fitting the polynomial regression
  regressor.weekend <- lm(formula = out ~ year+year_p2+month+month_p2+
                                    hour+hour_p2+hour_p3+hour_p4,
                       data = dataset.out.weekend)
  # Function returns the regressor for weekday
  return(regressor.weekend)
}

# Function to select the model for weekday prediction
predictStationXinWeekday <-  function(SelectStation){
  # If the station number is in Model1 then use Model1 (12th poly)
  if (SelectStation %in% Model1Station){
    RegressorWeekday = weekdayPolyModel1(dataset.out.weekday)
  }
  # If the station number is in Model2 then use Model2 (10th poly)
  if (SelectStation %in% Model2Station){
    RegressorWeekday = weekdayPolyModel2(dataset.out.weekday)
  } 
  return(RegressorWeekday)
}

# Function to use model (4th poly) for weekend prediction
predictStationXinWeekend <-  function(SelectStation){
  RegressorWeekend = weekendPolyModel(dataset.out.weekend)
  return(RegressorWeekend)
}

# Function to create the predicted dataset in weekday
predictedWeekdayDemand <- function(SelectStation,Year,Month,Hour){
  # Select the model 1 (12th poly)
  if (SelectStation %in% Model1Station){
    modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
                            hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
                            hour_p9 = Hour^9, hour_p10 = Hour^10, hour_p11 = Hour^11, hour_p12 = Hour^12)  
  }
  # Select the model 2 (10th poly)
  if (SelectStation %in% Model2Station){
    modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4,
                            hour_p5 = Hour^5, hour_p6 = Hour^6, hour_p7 = Hour^7, hour_p8 = Hour^8,
                            hour_p9 = Hour^9, hour_p10 = Hour^10) 
  }
  # Create the predicted dataset in weekday
  predictedWeekdayData = cbind.data.frame(hour = Hour, 
                                          demand = predict(myPolyModelWeekday,newdata = modeldata))
  # Return the predicted dataset in weekday
  return(predictedWeekdayData)
}

# Function to create the predicted dataset in weekend
predictedWeekendDemand <- function(SelectStation,Year,Month,Hour){
  # Use 4th poly model
  modeldata <-  data.frame(year = Year, year_p2 = Year^2, month = Month, month_p2 = Month^2,
                            hour = Hour, hour_p2 = Hour^2, hour_p3 = Hour^3, hour_p4 = Hour^4)  
  # Create the predicted dataset in weekend
  predictedWeekendData = cbind.data.frame(hour = Hour, 
                                             demand = predict(myPolyModelWeekend,newdata = modeldata))
  # Return the predicted dataset in weekend
  return(predictedWeekendData)
}

Predict the rental demand in Station X

# Select station X
X <- 9
StationX <- Top10Stations[X]

# Create a dataset for the rental demand in Station X by hour and weekday

# Count the rental demand in Station X by hour
StationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$StartStationNumber == StationX)

# Count the rental demand in Station X by hour
StationX.summary <- ddply(StationX.df,.(month,weekday,hour),summarise,N = length(hour))
StationX.summary.detail <- ddply(StationX.df,.(year,month,day,weekday,hour),summarise,N = length(hour))

# Count the returned bike in Station X by hour
EndStationX.df <- subset(DemandPredictionDataSet, DemandPredictionDataSet$EndStationNumber == StationX)
EndStationX.summary.detail <- ddply(EndStationX.df,.(end_year,end_month,end_day,end_weekday,end_hour),
                         summarise,N = length(end_hour))
colnames(EndStationX.summary.detail)[1:5] <- c('year','month','day','weekday','hour')
StationX.demand.detail <- merge.data.frame(StationX.summary.detail, EndStationX.summary.detail,
                                   by = c('year','month','day','weekday','hour'),
                                   all = TRUE)
StationX.demand.detail[is.na(StationX.demand.detail)] <- 0
colnames(StationX.demand.detail)[6:7] <- c('out','In')
StationX.demand.detail$netout <- StationX.demand.detail$out - StationX.demand.detail$In
dataset.out <- StationX.demand.detail[c(1:5,6)]  

# Create the dataset for weekday and weekend
weekend.position <- dataset.out$weekday == 'Saturday' | dataset.out$weekday == 'Sunday'
dataset.out.weekday <- subset(dataset.out, weekend.position == FALSE)
dataset.out.weekend <- subset(dataset.out, weekend.position == TRUE)
PredictYear <- 2015
PredictMonth <- 6
hour_grid <- seq(0, 23, 0.1)

# Predict weekday rental demand in Station X
myPolyModelWeekday <- predictStationXinWeekday(StationX)
summary(myPolyModelWeekday)
## 
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour + 
##     hour_p2 + hour_p3 + hour_p4 + hour_p5 + hour_p6 + hour_p7 + 
##     hour_p8 + hour_p9 + hour_p10 + hour_p11 + hour_p12, data = dataset.out.weekday)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.748 -1.759 -0.326  1.401 16.431 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.262e+05  1.788e+04  -35.03   <2e-16 ***
## year         6.221e+02  1.776e+01   35.03   <2e-16 ***
## year_p2     -1.545e-01  4.410e-03  -35.03   <2e-16 ***
## month        8.755e-01  1.895e-02   46.19   <2e-16 ***
## month_p2    -6.302e-02  1.414e-03  -44.57   <2e-16 ***
## hour        -1.674e+01  7.354e-01  -22.76   <2e-16 ***
## hour_p2      3.371e+01  1.324e+00   25.46   <2e-16 ***
## hour_p3     -2.509e+01  9.403e-01  -26.68   <2e-16 ***
## hour_p4      9.378e+00  3.547e-01   26.44   <2e-16 ***
## hour_p5     -2.040e+00  8.085e-02  -25.23   <2e-16 ***
## hour_p6      2.799e-01  1.190e-02   23.52   <2e-16 ***
## hour_p7     -2.527e-02  1.170e-03  -21.60   <2e-16 ***
## hour_p8      1.525e-03  7.758e-05   19.66   <2e-16 ***
## hour_p9     -6.091e-05  3.429e-06  -17.76   <2e-16 ***
## hour_p10     1.546e-06  9.679e-08   15.97   <2e-16 ***
## hour_p11    -2.256e-08  1.578e-09  -14.29   <2e-16 ***
## hour_p12     1.441e-10  1.131e-11   12.73   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.721 on 33922 degrees of freedom
## Multiple R-squared:  0.3698, Adjusted R-squared:  0.3695 
## F-statistic:  1244 on 16 and 33922 DF,  p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekdayData = predictedWeekdayDemand(StationX,PredictYear,PredictMonth,hour_grid)

# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekday$year == PredictYear & dataset.out.weekday$month == PredictMonth
plotData <-  dataset.out.weekday[PointToPlot1, c(5,6)]

# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekday <- ggplot()+
  geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
  geom_line(data = predictedWeekdayData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
  ggtitle(paste("Weekday Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
  xlab('Hour') +
  ylab('Demand')+
  theme(plot.title = element_text(hjust = 0.5))
plotWeekday

#Predict weekend rental demand in Station X
myPolyModelWeekend <- predictStationXinWeekend(StationX)
summary(myPolyModelWeekend)
## 
## Call:
## lm(formula = out ~ year + year_p2 + month + month_p2 + hour + 
##     hour_p2 + hour_p3 + hour_p4, data = dataset.out.weekend)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6311 -1.8848 -0.2882  1.5234 17.0175 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.382e+05  3.002e+04  -27.92   <2e-16 ***
## year         8.326e+02  2.982e+01   27.92   <2e-16 ***
## year_p2     -2.068e-01  7.406e-03  -27.92   <2e-16 ***
## month        1.069e+00  3.157e-02   33.86   <2e-16 ***
## month_p2    -7.858e-02  2.365e-03  -33.23   <2e-16 ***
## hour        -2.085e+00  6.590e-02  -31.64   <2e-16 ***
## hour_p2      4.635e-01  1.198e-02   38.70   <2e-16 ***
## hour_p3     -2.786e-02  7.766e-04  -35.88   <2e-16 ***
## hour_p4      5.029e-04  1.649e-05   30.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.924 on 14129 degrees of freedom
## Multiple R-squared:  0.4336, Adjusted R-squared:  0.4333 
## F-statistic:  1352 on 8 and 14129 DF,  p-value: < 2.2e-16
# Get the predicted demand data
predictedWeekendData = predictedWeekendDemand(StationX,PredictYear,PredictMonth,hour_grid)

# Get the actual deamnd data in the month and year
PointToPlot1 <- dataset.out.weekend$year == PredictYear & dataset.out.weekend$month == PredictMonth
plotData <-  dataset.out.weekend[PointToPlot1, c(5,6)]

# Plot the boxplot of the actual data and lineplot of the predicted data
plotWeekend <- ggplot()+
  geom_boxplot(data = plotData,aes(x=hour,y =out, group = hour))+
  geom_line(data = predictedWeekendData, aes(x=hour, y=demand), colour = 'blue',group = 1) +
  ggtitle(paste("Weekend Rental Demand at Station ",StationX, " in ",PredictYear, "-", PredictMonth, sep = "")) +
  xlab('Hour') +
  ylab('Demand')+
  theme(plot.title = element_text(hjust = 0.5))
plotWeekend